package com.kdtech.analyse.NewsPaper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * Analyser for the electronic newspapers published under hinews.cn.
 *
 * <p>Example sites:
 * <ul>
 *   <li>http://fzsb.hinews.cn/php/index.php - Legal Times (Fazhi Shibao)</li>
 *   <li>http://ndwb.hinews.cn/html/2012-12/04/node_462.htm - Nandao Evening News</li>
 *   <li>http://zqdb.hinews.cn/html/2012-10/26/node_542.htm - Securities Herald (Zhengquan Daobao)</li>
 *   <li>http://ngdsb.hinews.cn/html/2012-12/04/node_202.htm - Nanguo Metropolis Daily</li>
 * </ul>
 *
 * <p>NOTE(review): the URL patterns below accept the hosts {@code hnrb}, {@code ndwb},
 * {@code fzsb} and {@code ngdsb}. {@code zqdb} is listed above but not accepted, and
 * {@code hnrb} is accepted but not listed - confirm which list is the intended one.
 *
 * @author KK
 */
public class HinewsNewsPaperAnalyse implements AnalyseNews {

	/**
	 * Scheme, host alternatives and dated directory shared by both URL patterns.
	 * The dots of the host name are escaped so they only match a literal '.'.
	 */
	private static final String DATED_PATH_PREFIX =
			"http://(hnrb|ndwb|fzsb|ngdsb)\\.hinews\\.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/";

	/** Patterns of article pages: {@code .../content_<digits>_<digits>.htm}. */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			DATED_PATH_PREFIX + "content_[0-9]*_[0-9]*\\.htm"
	};

	/** Patterns of listing pages: {@code .../node_<digits>.htm}. */
	private static final String[] TASK_PAGE_PATTERNS = {
			DATED_PATH_PREFIX + "node_[0-9]*\\.htm"
	};

	/** Paragraph inside four levels of nested tables; used by the second page layout. */
	private static final String NESTED_TABLE_PARAGRAPH_SELECTOR =
			"html body div table tbody tr td table tbody tr td table tbody tr td table tbody tr td p";

	/**
	 * Tells whether the URL has the shape of an article page.
	 *
	 * @param url the URL to test
	 * @return the result of matching {@code url} against the detail-page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, content and date of an article page.
	 *
	 * <p>Three page layouts are tried in order; a later layout is only consulted
	 * while the title is still empty, and it then replaces the content as well:
	 * <ol>
	 *   <li>title in {@code td.bt1}, content in {@code div#ozoom founder-content};</li>
	 *   <li>title and content inside the nested-table paragraph;</li>
	 *   <li>title in {@code founder-title}, content in {@code founder-content}.</li>
	 * </ol>
	 * The date is whatever {@link DateUtils#matchDate} derives from the URL.
	 *
	 * @param urlMeta carrier of the page URL and its downloaded HTML; a null HTML
	 *                body is treated as an empty document
	 * @return a new {@link NewsMeta} holding URL, title, content and date
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String url = urlMeta.getUrl();
		String html = urlMeta.getHtml();

		// Jsoup.parse does not accept null; an empty document simply matches no
		// selector, which is the same path as a page with an unknown layout.
		Document doc = Jsoup.parse(html == null ? "" : html);

		Long date = DateUtils.matchDate(url);

		String title = doc.select("td.bt1").text();
		String content = HtmlCleaner.getContentHtml(url, doc.select("div#ozoom founder-content"));

		if (StringUtils.isNullOrEmpty(title)) {
			title = doc.select(NESTED_TABLE_PARAGRAPH_SELECTOR + " font font").text();
			content = HtmlCleaner.getContentHtml(url, doc.select(NESTED_TABLE_PARAGRAPH_SELECTOR));
		}
		if (StringUtils.isNullOrEmpty(title)) {
			title = doc.select("founder-title").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("founder-content"));
		}

		NewsMeta newspaper = new NewsMeta();
		newspaper.setUrl(url);
		newspaper.setTitle(StringUtils.trimSpace(title));
		newspaper.setContent(StringUtils.trimSpace(content));
		newspaper.setDate(date);
		return newspaper;
	}

	/**
	 * Tells whether the URL has the shape of a listing page.
	 *
	 * @param url the URL to test
	 * @return the result of matching {@code url} against the task-page patterns
	 */
	public boolean isTaskPage(String url) {
		return RegexUtils.matchAny(url, TASK_PAGE_PATTERNS);
	}

}
